Notes: scatterplots are the simpliest form of multivarite visualization
Notes:
Notes:
library(tidyverse)
## ── Attaching packages ───────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
pf<-read.csv('pseudo_facebook.tsv',sep='\t')
ggplot(data=pf,aes(x=age,y=friend_count))+
geom_point()
ggplot(aes(x = gender, y = age),
data = subset(pf, !is.na(gender))) + geom_boxplot()+
stat_summary(fun.y=mean,geom="point",shape=4)
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(gender)))+
geom_line(aes(color=gender),stat='summary',fun.y=median)
Write code to create a new data frame, called ‘pf.fc_by_age_gender’, that contains information on each age AND gender group.
The data frame should contain the following variables:
mean_friend_count, median_friend_count, n (the number of users in each age and gender grouping)
pf.fc_by_age_gender<-pf%>%
filter(!is.na(gender))%>%
group_by(age,gender)%>%
summarise(mean_friend_count=mean(friend_count),
median_friend_count=median(friend_count),
n=n())
Notes:
ggplot(data=pf.fc_by_age_gender,
aes(x=age,y=median_friend_count,color=gender))+
geom_line()
Notes:
Notes: long formats has many repeats, it is tidy. wide format is cleaner to look at without SQL type data packaging.
Notes:
# install.packages('reshape2')
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
pf.fc_by_age_gender.wide<-dcast(pf.fc_by_age_gender,
age~gender,
value.var='median_friend_count')
head(pf.fc_by_age_gender.wide)
## age female male
## 1 13 148.0 55.0
## 2 14 224.0 92.5
## 3 15 276.0 106.5
## 4 16 258.5 136.0
## 5 17 245.5 125.0
## 6 18 243.0 122.0
Notes:
library(tidyverse)
ggplot(data=pf.fc_by_age_gender.wide,aes(x=age,y=female/male))+
geom_line()+
geom_hline(linetype=2,alpha=.3,yintercept=1)
Notes:
pf<-mutate(pf,year_joined=2014-ceiling((tenure/365)))
# below is garbage version answer they want
# pf$year_joined<-with(pf,2014-ceiling((tenure/365)))
Notes:
summary(pf$year_joined)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
table(pf$year_joined)
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
pf$year_joined.bucket<-with(pf,cut(year_joined,breaks=c(2004,2009,2011,2012,2014)))
Notes:
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket)))+
geom_line(aes(color=year_joined.bucket),stat='summary',fun.y=median)
Notes:
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(year_joined.bucket)))+
geom_line(aes(color=year_joined.bucket),stat='summary',fun.y=mean)+
geom_line(stat='summary',fun.y=mean,linetype=2)
Notes: friends per day of tenure
with(subset(pf,tenure>0),summary(friend_count/tenure))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0775 0.2205 0.6096 0.5658 417.0000
Notes:
What is the median friend rate?
What is the maximum friend rate?
median friend count is .2205, and max is 417
Create a line graph of mean of friendships_initiated per day (of tenure) vs. tenure colored by year_joined.bucket.
ggplot(data=subset(pf,tenure>0),
aes(x=tenure,y=friendships_initiated/tenure))+
geom_line(stat='summary',fun.y=mean,aes(color=year_joined.bucket))
Notes:
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean)
ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
## use geom_smooth for same effect
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_smooth(aes(color = year_joined.bucket))
## `geom_smooth()` using method = 'gam'
Notes:the correct average frame is important to use. for NFL, a 7 day moving average makes sense since they usually have 1 game a week.
Notes: there is a history of purchase data over time. lets look at household purchases of 5 flavors of yogurt. many rows per household, one per purchase.
Notes:
yo<-read.csv('yogurt.csv')
yo$id<-factor(yo$id)
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : Factor w/ 332 levels "2100081","2100370",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
ggplot(data=yo,aes(x=price))+
geom_histogram(binwidth=10)
Notes:Create a new variable called all.purchases, which gives the total counts of yogurt for each observation or household.
summary(yo)
## obs id time strawberry
## Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
## 1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
## Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
## Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
## 3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
## Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
## (Other):2061
## blueberry pina.colada plain mixed.berry
## Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000
##
## price
## Min. :20.00
## 1st Qu.:50.00
## Median :65.04
## Mean :59.25
## 3rd Qu.:68.96
## Max. :68.96
##
length(yo)
## [1] 9
table(yo$price)
##
## 20 24.96 33.04 33.2 33.28 33.36 33.52 39.04 44 45.04 48.96 49.52
## 2 11 54 1 1 22 1 234 21 11 81 1
## 49.6 50 55.04 58.96 62 63.04 65.04 68.96
## 1 205 6 303 15 2 799 609
yo<-transform(yo,all.purchases=strawberry+blueberry+pina.colada+plain+mixed.berry)
ggplot(data=yo,aes(x=all.purchases))+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes:
ggplot(data=yo,aes(x=time,y=price))+
geom_jitter(alpha=1/20)
Notes: we might want to look at a small sample of households for more detail
set.seed(450)
sample.ids<-sample(levels(yo$id),16)
ggplot(data=subset(yo,id %in% sample.ids),
aes(x=time,y=price))+
facet_wrap(~id)+
geom_line()+
geom_point(aes(size=all.purchases),pch=1)
Notes:
Notes:
Notes:
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
set.seed(1836)
pf_subset<-pf[,c(2:15)]
names(pf_subset)
## [1] "age" "dob_day"
## [3] "dob_year" "dob_month"
## [5] "gender" "tenure"
## [7] "friend_count" "friendships_initiated"
## [9] "likes" "likes_received"
## [11] "mobile_likes" "mobile_likes_received"
## [13] "www_likes" "www_likes_received"
ggpairs(pf_subset[sample.int(nrow(pf_subset),1000),])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Notes:
Notes:
nci <- read.table("nci.tsv")
colnames(nci) <- c(1:64)
nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
## gene case value
## 1 1 1 0.300
## 2 2 1 1.180
## 3 3 1 0.550
## 4 4 1 1.140
## 5 5 1 -0.265
## 6 6 1 -0.070
ggplot(aes(y = gene, x = case, fill = value),
data = nci.long.samp) +
geom_tile() +
scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
Reflection:
Your task is to create a new variable called ‘prop_initiated’ in the Pseudo-Facebook data set. The variable should contain the proportion of friendships that the user initiated.
suppressWarnings(suppressMessages(library(dplyr)))
pf %>%
mutate(prop_initiated=friendships_initiated/friend_count)->pf
ggplot(data=subset(pf,friend_count>0),
aes(x=tenure,y=prop_initiated,color=year_joined.bucket))+
geom_line(stat='summary',fun.y=median)
## Warning: Removed 2 rows containing non-finite values (stat_summary).
ggplot(data=subset(pf,friend_count>0),
aes(x=30 * round(tenure/30),
y=prop_initiated,color=year_joined.bucket))+
geom_line(stat='summary',fun.y=median)
## Warning: Removed 2 rows containing non-finite values (stat_summary).
Mean proportion of friendships initiated by year_joined.bucket
pf%>%
filter(friend_count>0)%>%
group_by(year_joined.bucket)%>%
summarise(mean(prop_initiated))
## # A tibble: 5 x 2
## year_joined.bucket `mean(prop_initiated)`
## <fct> <dbl>
## 1 (2004,2009] 0.467
## 2 (2009,2011] 0.530
## 3 (2011,2012] 0.598
## 4 (2012,2014] 0.665
## 5 <NA> 0.617
In theory, if we assume that people initate the most friendships when the first join, newer users have much more users that they can initiate friendships with.